################################################################################################
## Replication File for 
## "Capturing Clicks: How the Chinese Government Uses Clickbait to Compete for Visibility"
## Yingdan Lu and Jennifer Pan
## April, 2020
################################################################################################

################################################################################################
## Topic modeling
################################################################################################

######### Set Up #########
#install.packages("quanteda")
library(quanteda) #version 1.5.1
#install.packages("jiebaR")
library(jiebaR) #version 0.10.99
#install.packages("stm")
library(stm) #version 1.3.3
#install.packages("lubridate")
library(lubridate) #version 1.7.4
#install.packages("dplyr")
library(dplyr) #version 0.8.1
#install.packages("ggplot2")
library(ggplot2) #version 3.2.1
#install.packages("extrafont")
library(extrafont) #version 0.17

# Register the fonts known to extrafont so they can be used in the figures
loadfonts()

# Move to the parent directory; the relative "data/" paths used by this
# script are resolved from there
setwd("..")

# Read the sample of posts (strings are kept as character, not factor)
df <- read.csv("data/sample_posts.csv", header = TRUE, encoding = "UTF-8",
               stringsAsFactors = FALSE)

# Parse the posting date, stored as month-day-year
df$date_pek <- mdy(df$date_pek)

# Recode a count column into a 0/1 indicator: 1 when the count is at least 1
at_least_one <- function(counts) {
  ifelse(counts >= 1, 1, 0)
}

df$excl <- at_least_one(df$excl_mark)
df$que <- at_least_one(df$question_mark)
df$ell <- at_least_one(df$ellipsis_mark)
df$pron <- at_least_one(df$pronoun_num)
df$phrase <- at_least_one(df$phrases_num)

# A post counts as clickbait (bait = 1) when any one of the nine features
# below is present in it
bait_features <- c("listicles", "gennn", "hyperbolic", "slang", "excl", "que",
                   "ell", "pron", "phrase")
has_feature <- df[, bait_features] >= 1
df$bait <- ifelse(apply(has_feature, 1, any), 1, 0)

######### STM Topic Modeling #########
# Segment every title into words with jiebaR, using the project stop-word list.
# `symbol = FALSE` and `bylines = TRUE` are passed through unchanged; the latter
# is expected to give one token vector per input title.
cutter <- worker(stop_word = "data/stopword.txt", symbol = FALSE,
                 bylines = TRUE)
results_all <- segment(df$title, cutter)

# Blank out runs of Latin letters, digits and periods inside each token, then
# rejoin each title's tokens with single spaces.
# lapply()/vapply() always return exactly one string per title. The nested
# sapply() used previously simplifies to a matrix whenever all titles happen to
# have the same number of tokens, which breaks the one-row-per-title alignment.
cleaned_tokens <- lapply(results_all, function(tokens) {
  gsub("[a-zA-Z.0-9]+", "", tokens)
})
text <- vapply(cleaned_tokens, paste, character(1), collapse = " ",
               USE.NAMES = FALSE)

# Treat titles with nothing left after cleaning as missing.
# NOTE(review): only these three exact strings are blanked; other
# whitespace-only results (e.g. a single space) are kept as in the original
# analysis, so the set of documents entering the model is unchanged.
text[text %in% c("", "  ", ". ")] <- NA
df$text <- text

# Keep posts with usable text and drop repeated titles (first occurrence wins).
# Deduplicating on the `text` column directly avoids adding a stray column
# literally named "dt_lda$text" to the metadata.
dt_lda <- df[!is.na(df$text), ]
dt_lda <- distinct(dt_lda, text, .keep_all = TRUE)

# Construct the document-feature matrix and attach the metadata.
# Every column of dt_lda travels with the corpus as a document variable.
ltokens <- corpus(dt_lda$text, docvars = dt_lda)
ldfm <- dfm(ltokens, remove_punct = TRUE)

# Convert to the list format stm expects; empty documents are not dropped
out <- convert(ldfm, to = "stm", omit_empty = FALSE, docvars = dt_lda)
docs <- out[["documents"]]
vocab <- out[["vocab"]]
meta <- out[["meta"]]

# Fit a 30-topic structural topic model with the clickbait indicator as the
# prevalence covariate; the seed is fixed so the run can be reproduced
topic_model <- stm(
  documents = docs,
  vocab = vocab,
  K = 30,
  prevalence = ~bait,
  data = meta,
  max.em.its = 100,
  seed = 15
)

# Save the top keywords and the top 10 related titles of every topic to a
# data frame (one row per topic).

# Seven highest-probability words per topic, collapsed into one string each
topic_keywords <- data.frame(apply(labelTopics(topic_model, n = 7)[["prob"]],
                                   1, paste, collapse = " "))

# Take the number of topics from the fitted model (columns of the
# document-topic matrix) instead of hard-coding it
n_topics <- ncol(topic_model$theta)

# One single-column data frame per topic holding its 10 most representative
# titles. Built with lapply() and bound once, rather than growing a data frame
# inside a loop and assigning to `t`, which masks base::t().
posts_by_topic <- lapply(seq_len(n_topics), function(i) {
  as.data.frame(
    findThoughts(topic_model, n = 10, texts = meta$title, topics = i)$docs
  )
})
topic_posts <- do.call(cbind, posts_by_topic)

# Transpose so topics are rows, then put the keywords in the first column
topic_posts <- cbind(topic_keywords, t(topic_posts))

######### Figure 3 #########
# Hand-assigned labels for the 30 topics; the i-th label belongs to topic i
topic_labels <- c(
  "Social security, welfare policies",                 # topic 1
  "National media coverage of locality",               # topic 2
  "Model citizen stories",                             # topic 3
  "Extreme weather",                                   # topic 4
  "Subway, highway, rail construction",                # topic 5
  "Crime and punishment",                              # topic 6
  "Motivational messages",                             # topic 7
  "Customs and festivals",                             # topic 8
  "Notable events",                                    # topic 9
  "Local government meetings",                         # topic 10
  "Local cultural events",                             # topic 11
  "Weather forecast",                                  # topic 12
  "College entrance examination",                      # topic 13
  "Propaganda slogans for local development",          # topic 14
  "Local history, culture",                            # topic 15
  "Advice on daily life",                              # topic 16
  "Local wins in nat'l/int'l competition",             # topic 17
  "Local government activities",                       # topic 18
  "cannot label",                                      # topic 19
  "Local recognition by upper-level government",       # topic 20
  "Six-city policy",                                   # topic 21
  "Local claims to fame",                              # topic 22
  "Tourism and travel",                                # topic 23
  "Local news",                                        # topic 24
  "Public transportation, travel advisories",          # topic 25
  "Local officials' activities",                       # topic 26
  "Local implementation of central propaganda",        # topic 27
  "Local implementation of nat'l anti-gang campaign",  # topic 28
  "Advice on healthy living and safety",               # topic 29
  "Local construction projects"                        # topic 30
)

# Average each topic's proportion over all documents and pair it with its
# label; [-1] drops the mean of the first column of make.dt()'s output, which
# is not a topic proportion
prop_dt <- data.frame(colMeans(make.dt(topic_model))[-1], topic_labels)
colnames(prop_dt) <- c("prop", "label")
prop_dt$label <- as.factor(prop_dt$label)

# Sort the labelled topics from largest to smallest proportion and append the
# "cannot label" topic as the last row
unlabelled_rows <- which(prop_dt$label == "cannot label")
labelled_rows <- which(prop_dt$label != "cannot label")
cannot_label <- prop_dt[unlabelled_rows, ]
plot_data <- prop_dt[labelled_rows, ]
plot_data <- plot_data[order(plot_data$prop, decreasing = TRUE), ]
plot_data <- rbind(plot_data, cannot_label)

# Fixed topic ranking for the figure (top to bottom) together with the
# meta-label of every topic. Keeping both in one named vector means the
# meta-label is looked up by topic label rather than by row position, so a
# slightly different ordering of the estimated proportions cannot attach a
# category to the wrong topic.
# NOTE(review): the pairing assumes the original position-based meta-labels
# were written in the same order as the fixed ranking -- confirm.
topic_category <- c(
  "Local government activities" = "Topics related to propaganda",
  "Local recognition by upper-level government" = "Topics related to propaganda",
  "Social security, welfare policies" = "Topics related to propaganda",
  "Local officials' activities" = "Topics related to propaganda",
  "Local government meetings" = "Topics related to propaganda",
  "Public transportation, travel advisories" = "Topics unrelated to propaganda",
  "Local cultural events" = "Topics unrelated to propaganda",
  "Advice on healthy living and safety" = "Topics unrelated to propaganda",
  "Propaganda slogans for local development" = "Topics related to propaganda",
  "Tourism and travel" = "Topics unrelated to propaganda",
  "Crime and punishment" = "Topics related to propaganda",
  "Subway, highway, rail construction" = "Topics related to propaganda",
  "Local news" = "Ambiguous topics",
  "Weather forecast" = "Topics unrelated to propaganda",
  "National media coverage of locality" = "Ambiguous topics",
  "Local implementation of nat'l anti-gang campaign" = "Topics related to propaganda",
  "Motivational messages" = "Ambiguous topics",
  "Six-city policy" = "Topics related to propaganda",
  "Notable events" = "Ambiguous topics",
  "Local wins in nat'l/int'l competition" = "Ambiguous topics",
  "College entrance examination" = "Topics unrelated to propaganda",
  "Advice on daily life" = "Topics unrelated to propaganda",
  "Customs and festivals" = "Topics unrelated to propaganda",
  "Extreme weather" = "Topics unrelated to propaganda",
  "Local claims to fame" = "Topics unrelated to propaganda",
  "Local implementation of central propaganda" = "Topics related to propaganda",
  "Local history, culture" = "Ambiguous topics",
  "Model citizen stories" = "Ambiguous topics",
  "Local construction projects" = "Topics related to propaganda",
  "cannot label" = "cannot label"
)

# Every plotted topic must have an entry; otherwise the factor level and the
# meta-label below would silently become NA
stopifnot(all(as.character(plot_data$label) %in% names(topic_category)))

# add the meta-label, matched on the topic label
plot_data$propaganda <- unname(topic_category[as.character(plot_data$label)])

# add factor variable to fix the topic ranking (reversed, because the flipped
# bar chart draws the first level at the bottom)
plot_data$label <- factor(plot_data$label, levels = rev(names(topic_category)))

# calculate the overall topic proportions in meta-label categories
topic_sum_prop <- as.data.frame(plot_data %>%
                                  group_by(propaganda) %>%
                                  summarise(sum_prop = sum(prop)))
print(topic_sum_prop)

# turn the meta-label into a factor so the categories have a fixed order
plot_data$propaganda <- factor(plot_data$propaganda,
                               levels = c("Topics related to propaganda",
                                          "Ambiguous topics",
                                          "Topics unrelated to propaganda",
                                          "cannot label"))
# plot the topic proportion
# Fill and outline colours are named by category. Unnamed values combined with
# a shorter `breaks` vector are matched to the breaks in newer ggplot2
# releases, which would leave "cannot label" without its white fill; named
# values are matched by category in every version.
category_fill <- c("Topics related to propaganda" = "black",
                   "Ambiguous topics" = "#939393",
                   "Topics unrelated to propaganda" = "#D3D3D3",
                   "cannot label" = "white")
category_outline <- c("Topics related to propaganda" = "black",
                      "Ambiguous topics" = "#939393",
                      "Topics unrelated to propaganda" = "#D3D3D3",
                      "cannot label" = "#D3D3D3")

ggplot(plot_data, aes(x = label, y = prop, fill = propaganda,
                      color = propaganda)) +
  geom_bar(stat = "identity", width = 0.5) +
  coord_flip() +
  # "cannot label" is left out of the breaks so it does not get a legend entry
  scale_fill_manual(breaks = c("Topics related to propaganda",
                               "Ambiguous topics",
                               "Topics unrelated to propaganda"),
                    values = category_fill, name = "") +
  scale_color_manual(breaks = c("Topics related to propaganda",
                                "Ambiguous topics",
                                "Topics unrelated to propaganda",
                                "cannot label"),
                     values = category_outline, name = "") +
  ylim(0, 0.2) +
  ylab("Expected Topic Proportions") +
  xlab("") +
  theme_bw(base_size = 16, base_family = 'Times New Roman') +
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank()) +
  theme(axis.title.x = element_text(size = 16, colour = "black"),
        axis.title.y = element_text(size = 16, colour = "black"),
        text = element_text(size = 16, colour = "black"),
        axis.text.x = element_text(size = 16, colour = "black"),
        axis.text.y = element_text(size = 16, colour = "black"),
        legend.position = c(0.6, 0.5),
        legend.text = element_text(size = 14, colour = "black"),
        legend.key.size = unit(0.7, "cm")) +
  # hide the outline-colour legend; "none" replaces the deprecated FALSE
  guides(colour = "none")


######### Appendix Figure A2 (May take several hours to run) #########
## select the optimal number of topics and plot
# K<-c(10,15,20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80)
# kresult_all <- searchK(docs, vocab, K,prevalence=~bait, data = meta, heldout.seed = 15)
# plot(kresult_all, text.cex = 1)
## With 30 topics, the model performs better overall across all criteria
## Thus, we estimate 30 topics (please set seed = 15; this is the seed number selected by the stm model when we ran it)


######### Appendix Table A3 #########
# One row per topic: its ID, its hand-assigned label, and its seven
# highest-probability words joined by spaces
top_words <- apply(labelTopics(topic_model, n = 7)[["prob"]], 1, paste,
                   collapse = " ")
table_cells <- cbind(seq(1, 30), topic_labels, top_words)

# Note: this replaces the topic_labels character vector with the table
topic_labels <- data.frame(table_cells)
colnames(topic_labels) <- c("ID", "Topic Label", "Words (Chinese)")
print(topic_labels)
